Detected ASvs
df <- as.data.frame(sample_sums(phy))
colnames(df) <- c("Number_of_ASVs")
df %>% kable(booktabs = TRUE, longtable = TRUE) %>% kable_styling(latex_options = c("hold_position",
"repeat_header", position = "")) %>% scroll_box(width = "100%", height = "200px")
|
|
Number_of_ASVs
|
|
S1
|
63230
|
|
S100
|
47012
|
|
S101
|
101156
|
|
S102
|
111762
|
|
S103
|
280956
|
|
S104
|
263280
|
|
S105
|
125068
|
|
S106
|
304143
|
|
S107
|
102080
|
|
S108
|
235556
|
|
S109
|
240091
|
|
S11
|
73437
|
|
S111
|
234357
|
|
S112
|
35362
|
|
S114
|
241630
|
|
S115
|
64631
|
|
S116
|
127471
|
|
S117
|
194595
|
|
S118
|
11727
|
|
S119
|
215206
|
|
S120
|
1075421
|
|
S121
|
233627
|
|
S122
|
287572
|
|
S123
|
165222
|
|
S124
|
91555
|
|
S126
|
260483
|
|
S127
|
400493
|
|
S128
|
171973
|
|
S129
|
65591
|
|
S130
|
290641
|
|
S131
|
166106
|
|
S132
|
70909
|
|
S134
|
81964
|
|
S135
|
241499
|
|
S137
|
41925
|
|
S138
|
70297
|
|
S139
|
241951
|
|
S14
|
164789
|
|
S140
|
20706
|
|
S141
|
96494
|
|
S142
|
28144
|
|
S143
|
229699
|
|
S145
|
58180
|
|
S15
|
126667
|
|
S16
|
31290
|
|
S17
|
159843
|
|
S18
|
40422
|
|
S2
|
134381
|
|
S21
|
9807
|
|
S24
|
68727
|
|
S25
|
104068
|
|
S26
|
93601
|
|
S28
|
39612
|
|
S29
|
61703
|
|
S3
|
19223
|
|
S31
|
68789
|
|
S33
|
22368
|
|
S34
|
128892
|
|
S35
|
94267
|
|
S37
|
109080
|
|
S38
|
91271
|
|
S39
|
73717
|
|
S4
|
130053
|
|
S40
|
57703
|
|
S42
|
111092
|
|
S43
|
20532
|
|
S44
|
56180
|
|
S46
|
61731
|
|
S47
|
123685
|
|
S48
|
18249
|
|
S50
|
79458
|
|
S51
|
68707
|
|
S52
|
11937
|
|
S53
|
117322
|
|
S54
|
125443
|
|
S55
|
126926
|
|
S56
|
86093
|
|
S57
|
118198
|
|
S58
|
76489
|
|
S59
|
137027
|
|
S60
|
88624
|
|
S61
|
122507
|
|
S62
|
149594
|
|
S63
|
84432
|
|
S64
|
110986
|
|
S65
|
172466
|
|
S66
|
49984
|
|
S67
|
201620
|
|
S68
|
51369
|
|
S69
|
102936
|
|
S71
|
107829
|
|
S72
|
98095
|
|
S74
|
101384
|
|
S75
|
151991
|
|
S77
|
77881
|
|
S78
|
158062
|
|
S79
|
84411
|
|
S8
|
59270
|
|
S80
|
26477
|
|
S81
|
20615
|
|
S82
|
224827
|
|
S83
|
158696
|
|
S84
|
228773
|
|
S85
|
206129
|
|
S86
|
39624
|
|
S88
|
22891
|
|
S89
|
118506
|
|
S9
|
62794
|
|
S90
|
30077
|
|
S92
|
47402
|
|
S93
|
182728
|
|
S94
|
39212
|
|
S95
|
11159
|
|
S96
|
176939
|
|
S97
|
262579
|
|
S98
|
303543
|
phy <- subset_samples(phy, !is.na(Inflammation))
phy <- subset_samples(phy, sample_sums(phy) > 5000)
phy
phyloseq-class experiment-level object
otu_table() OTU Table: [ 4048 taxa and 114 samples ]
sample_data() Sample Data: [ 114 samples by 5 sample variables ]
tax_table() Taxonomy Table: [ 4048 taxa by 8 taxonomic ranks ]
phy_tree() Phylogenetic Tree: [ 4048 tips and 4041 internal nodes ]
total = median(sample_sums(phy))
standf = function(x, t = total) round(t * (x/sum(x)))
M.std = transform_sample_counts(phy, standf)
M.std = filter_taxa(M.std, function(x) sum(x > 10) > (0.02 * length(x)) | sum(x) >
0.001 * total, TRUE)
# M.f = filter_taxa(M.std,function(x) sum(x) > 0.001*total, TRUE)
ntaxa(M.std)
[1] 1047
require(cluster)
library(factoextra)
df1 <- data.frame(otu_table(M.std))
df_viz <- fviz_nbclust(df1, kmeans, method = "wss") + theme_minimal()
res_fanny <- fanny(t(df1), k = 3, metric = "SqEuclidean")
res_fanny <- pam(t(df1), k = 3)
sample_data(M.std)$cluster <- as.character(res_fanny$clustering)
# clusters<-NULL memb_prob<-res_fanny$membership
# sample_ids<-rownames(memb_prob) for (i in 1:dim(memb_prob)[1]) {
# memb_prob_<-max(memb_prob[i,]) tmp<-data.frame(sample_ids[i], memb_prob_)
# clusters<-rbind(clusters, tmp) }
require(microbiomeSeq)
require(pheatmap)
phyTop <- taxa_level(M.std, "G_species")
TopASVs <- names(sort(taxa_sums(phyTop), TRUE))[1:50]
df1 <- as.data.frame(otu_table(t(phyTop)))
df1 <- df1[which(rownames(df1) %in% TopASVs), ]
df1 <- df1[which(rownames(df1) != " "), ]
df <- data.frame((sample_data(phyTop))[, c("Inflammation", "BV")])
p <- pheatmap(log2(df1 + 1), cluster_rows = FALSE, show_rownames = TRUE, show_colnames = F,
cluster_cols = TRUE, annotation_col = df, annotation_row = NA)
p

p <- plot_richness(M.std, x = "cluster", color = "cluster", measures = c("Observed",
"Simpson"), title = paste0("Standardized to total reads, N=", nsamples(M.std)))
p <- p + geom_boxplot() + geom_point() + theme_bw() + theme(axis.text = element_text(size = 10,
face = "bold"), axis.title = element_text(size = 16, face = "bold")) + geom_point(size = 2)
p

p <- plot_richness(M.std, x = "Inflammation", color = "Inflammation", measures = c("Observed",
"Simpson"), title = paste0("Standardized to total reads, N=", nsamples(M.std)))
p <- p + geom_boxplot() + geom_point() + theme_bw() + theme(axis.text = element_text(size = 10,
face = "bold"), axis.title = element_text(size = 16, face = "bold")) + geom_point(size = 2)
p

est <- estimate_richness(M.std, split = TRUE, measures = c("Simpson"))
BV_div <- cbind(est, sample_data(M.std)[, "BV"])
t <- wilcox.test(BV_div$Simpson ~ BV_div$BV)
t
Wilcoxon rank sum test with continuity correction
data: BV_div$Simpson by BV_div$BV
W = 467, p-value = 5.493e-11
alternative hypothesis: true location shift is not equal to 0
BInflammation_div <- cbind(est, sample_data(M.std)[, "Inflammation"])
t <- wilcox.test(BInflammation_div$Simpson ~ BInflammation_div$Inflammation)
t
Wilcoxon rank sum test with continuity correction
data: BInflammation_div$Simpson by BInflammation_div$Inflammation
W = 2420, p-value = 2.206e-06
alternative hypothesis: true location shift is not equal to 0
Cluster_div <- cbind(est, sample_data(M.std)[, "cluster"])
dunn.test::dunn.test(Cluster_div$Simpson, Cluster_div$cluster)
Kruskal-Wallis rank sum test
data: x and group
Kruskal-Wallis chi-squared = 60.6833, df = 2, p-value = 0
Comparison of x by group
(No adjustment)
Col Mean-|
Row Mean | 1 2
---------+----------------------
2 | -6.491687
| 0.0000*
|
3 | -1.269108 6.204012
| 0.1022 0.0000*
alpha = 0.05
Reject Ho if p <= alpha/2
# ordination based on NMDS and bray-curtis distance
NMDS_bray <- ordinate(M.std, "NMDS")
Square root transformation
Wisconsin double standardization
Run 0 stress 0.1998966
Run 1 stress 0.2130476
Run 2 stress 0.2171235
Run 3 stress 0.2102853
Run 4 stress 0.2095962
Run 5 stress 0.2181037
Run 6 stress 0.219483
Run 7 stress 0.2124743
Run 8 stress 0.2054231
Run 9 stress 0.225887
Run 10 stress 0.2165795
Run 11 stress 0.2104146
Run 12 stress 0.2189583
Run 13 stress 0.2021484
Run 14 stress 0.222584
Run 15 stress 0.2246392
Run 16 stress 0.2129191
Run 17 stress 0.2092465
Run 18 stress 0.209589
Run 19 stress 0.2216952
Run 20 stress 0.2298562
*** No convergence -- monoMDS stopping criteria:
20: stress ratio > sratmax
title = c("NMDS of 16S microbiome, Bray-Curtis distance")
NMDS.bray.plot <- plot_ordination(M.std, NMDS_bray, color = "BV", shape = "Inflammation",
title = title)
NMDS.bray.plot <- NMDS.bray.plot + theme(axis.text = element_text(size = 16,
face = "bold"), axis.title = element_text(size = 18, face = "bold"), legend.title = element_text(size = 14)) +
theme_bw() + labs(color = "BV") + geom_point(size = 5)
NMDS.bray.plot

PCoA.ord.bray <- ordinate(M.std, "PCoA", "bray")
title = c("PCoA of 16S microbiome, Bray-Curtis distance")
PCoA.ord <- plot_ordination(M.std, PCoA.ord.bray, color = "cluster", shape = "BV",
title = title)
PCoA.ord <- PCoA.ord + theme(axis.text = element_text(size = 16, face = "bold"),
axis.title = element_text(size = 18, face = "bold"), legend.title = element_text(size = 14)) +
theme_bw() + labs(color = "cluster") + geom_point(size = 5)
PCoA.ord

# permanova
require(vegan)
phy_bray <- phyloseq::distance(M.std, method = "bray")
sampledf <- data.frame(sample_data(M.std))
adonis(phy_bray ~ BV, data = sampledf)
Call:
adonis(formula = phy_bray ~ BV, data = sampledf)
Permutation: free
Number of permutations: 999
Terms added sequentially (first to last)
Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
BV 1 6.973 6.9729 24.445 0.17916 0.001 ***
Residuals 112 31.948 0.2852 0.82084
Total 113 38.921 1.00000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
beta <- betadisper(phy_bray, sampledf$BV)
permutest(beta)
Permutation test for homogeneity of multivariate dispersions
Permutation: free
Number of permutations: 999
Response: Distances
Df Sum Sq Mean Sq F N.Perm Pr(>F)
Groups 1 0.00321 0.0032139 0.1963 999 0.648
Residuals 112 1.83390 0.0163741
adonis(phy_bray ~ Inflammation, data = sampledf)
Call:
adonis(formula = phy_bray ~ Inflammation, data = sampledf)
Permutation: free
Number of permutations: 999
Terms added sequentially (first to last)
Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
Inflammation 1 3.595 3.5948 11.397 0.09236 0.001 ***
Residuals 112 35.326 0.3154 0.90764
Total 113 38.921 1.00000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
beta <- betadisper(phy_bray, sampledf$Inflammation)
permutest(beta)
Permutation test for homogeneity of multivariate dispersions
Permutation: free
Number of permutations: 999
Response: Distances
Df Sum Sq Mean Sq F N.Perm Pr(>F)
Groups 1 0.00128 0.0012817 0.0985 999 0.741
Residuals 112 1.45685 0.0130076
adonis(phy_bray ~ cluster, data = sampledf)
Call:
adonis(formula = phy_bray ~ cluster, data = sampledf)
Permutation: free
Number of permutations: 999
Terms added sequentially (first to last)
Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
cluster 2 17.106 8.5530 43.52 0.43951 0.001 ***
Residuals 111 21.815 0.1965 0.56049
Total 113 38.921 1.00000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
beta <- betadisper(phy_bray, sampledf$BV)
permutest(beta)
Permutation test for homogeneity of multivariate dispersions
Permutation: free
Number of permutations: 999
Response: Distances
Df Sum Sq Mean Sq F N.Perm Pr(>F)
Groups 1 0.00321 0.0032139 0.1963 999 0.651
Residuals 112 1.83390 0.0163741
sampledf$cluster <- as.factor(sampledf$cluster)
logit_model <- glm(Inflammation ~ cluster + BMI + Age, family = binomial(link = "logit"),
data = sampledf)
summary(logit_model)
Call:
glm(formula = Inflammation ~ cluster + BMI + Age, family = binomial(link = "logit"),
data = sampledf)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.8426 -0.7226 -0.6225 0.9218 1.8866
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.54362 2.95825 0.184 0.854
cluster2 -2.52439 0.64857 -3.892 9.93e-05 ***
cluster3 -0.61320 0.63878 -0.960 0.337
BMI 0.04109 0.05115 0.803 0.422
Age -0.02309 0.14450 -0.160 0.873
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 141.14 on 102 degrees of freedom
Residual deviance: 116.86 on 98 degrees of freedom
(11 observations deleted due to missingness)
AIC: 126.86
Number of Fisher Scoring iterations: 4
exp(logit_model$coefficients)
(Intercept) cluster2 cluster3 BMI Age
1.72223820 0.08010712 0.54161358 1.04194406 0.97717878
exp(confint(logit_model))
2.5 % 97.5 %
(Intercept) 0.004740475 573.4388037
cluster2 0.020390051 0.2672824
cluster3 0.144419698 1.8330961
BMI 0.942085319 1.1536758
Age 0.734927320 1.3023893
anova(logit_model, test = "Chisq")
Analysis of Deviance Table
Model: binomial, link: logit
Response: Inflammation
Terms added sequentially (first to last)
Df Deviance Resid. Df Resid. Dev Pr(>Chi)
NULL 102 141.14
cluster 2 23.6062 100 117.54 7.481e-06 ***
BMI 1 0.6494 99 116.89 0.4203
Age 1 0.0255 98 116.86 0.8731
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Differential abundance using DESEQ2
require(DESeq2)
M.f.des <- taxa_level(M.std, "G_species")
dds <- phyloseq_to_deseq2(M.f.des, ~Inflammation)
# geometric mean, set to zero when all coordinates are zero
geo_mean_protected <- function(x) {
if (all(x == 0)) {
return(0)
}
exp(mean(log(x[x != 0])))
}
geoMeans <- apply(counts(dds), 1, geo_mean_protected)
dds <- estimateSizeFactors(dds, geoMeans = geoMeans)
dds <- DESeq(dds, fitType = "local")
res = results(dds, cooksCutoff = FALSE)
df <- as.data.frame(colData(dds)[, "BV"])
colnames(df) <- "BV"
alpha = 0.01
sigtab = as.data.frame(res[which(res$padj < alpha & abs(res$log2FoldChange) >
1.5), ])
# sigtab = cbind(as(sigtab, 'data.frame'),
# as(tax_table(M.f.des)[rownames(sigtab), ], 'matrix'))
dim(sigtab)
[1] 18 6
table_sig <- sigtab[c("log2FoldChange", "padj")]
table_sig$Abundant_Group <- levels(df$BV)[as.numeric(sigtab$log2FoldChange >
0) + 1]
table_sig %>% kable(booktabs = TRUE, longtable = TRUE) %>% kable_styling(latex_options = c("hold_position",
"repeat_header", position = "")) %>% scroll_box(width = "100%", height = "300px")
|
|
log2FoldChange
|
padj
|
Abundant_Group
|
|
Sneathia uncultured bacterium
|
-27.343938
|
0.0000000
|
Negative
|
|
Sneathia
|
-3.831081
|
0.0033540
|
Negative
|
|
Sneathia Sneathia amnii
|
-2.463017
|
0.0002363
|
Negative
|
|
Streptococcus Streptococcus anginosus subsp. anginosus
|
3.611588
|
0.0048922
|
Positive
|
|
Gemella Gemella asaccharolytica
|
-2.533120
|
0.0030237
|
Negative
|
|
Lactobacillus
|
2.105281
|
0.0050189
|
Positive
|
|
Anaerococcus uncultured Anaerococcus sp.
|
2.097034
|
0.0030237
|
Positive
|
|
Fastidiosipila Clostridiales bacterium oral clone MCE3_9
|
1.869674
|
0.0025170
|
Positive
|
|
Fastidiosipila unidentified
|
-3.753272
|
0.0011412
|
Negative
|
|
Shuttleworthia uncultured bacterium
|
-6.049291
|
0.0000000
|
Negative
|
|
Veillonella
|
2.832508
|
0.0051103
|
Positive
|
|
Megasphaera
|
-3.167728
|
0.0000000
|
Negative
|
|
Gardnerella
|
-2.850634
|
0.0094426
|
Negative
|
|
Sutterella Sutterella sp. Marseille-P3161
|
-5.242801
|
0.0065728
|
Negative
|
|
Porphyromonas Bacteroidales bacterium KA00251
|
-4.475867
|
0.0001637
|
Negative
|
|
Prevotella
|
-5.120501
|
0.0000000
|
Negative
|
|
Prevotella 6 Prevotella sp. S7-1-8
|
-6.571508
|
0.0000000
|
Negative
|
|
Prevotella Prevotella sp. DNF00663
|
-4.039743
|
0.0048922
|
Negative
|
library("pheatmap")
select <- order(rowMeans(counts(dds, normalized = TRUE)), decreasing = TRUE)[1:50]
df <- as.data.frame(colData(dds)[, c("Inflammation", "BV", "cluster")])
df1 <- data.frame(assay(dds))[select, ]
p <- pheatmap(log2(df1 + 1), cluster_rows = FALSE, show_rownames = TRUE, show_colnames = F,
cluster_cols = TRUE, annotation_col = df, annotation_row = NA)
p

require(randomForest)
data.rf <- data.frame(otu_table(M.f.des))
data.rf$BV <- sample_data(M.std)$BV
BV.rf <- randomForest(BV ~ ., data = data.rf, importance = TRUE, proximity = TRUE)
df_accuracy <- as.data.frame(BV.rf$importance)
df_accuracy$Taxa <- rownames(df_accuracy)
df_accuracy <- df_accuracy[order(df_accuracy$MeanDecreaseAccuracy, decreasing = TRUE),
][1:20, ]
df_accuracy$Taxa <- factor(df_accuracy$Taxa, levels = df_accuracy$Taxa)
mda_plot <- ggplot(data = df_accuracy, aes(x = Taxa, y = MeanDecreaseAccuracy)) +
theme_bw()
mda_plot <- mda_plot + geom_bar(stat = "identity", fill = "darkblue", width = 0.5)
mda_plot <- mda_plot + theme(axis.text.x = element_text(angle = 90, hjust = 1,
vjust = 0.5))
mda_plot <- mda_plot + xlab("") + ylab("Mean Decrease in Accuracy") + coord_flip() +
theme(axis.text = element_text(size = 10, face = "bold"), axis.title = element_text(size = 16,
face = "bold"))
mda_plot

data.rf_reg <- data.frame(otu_table(M.f.des))
data.rf$BMI <- sample_data(M.f.des)$BMI
BMI.reg <- randomForest(BMI ~ ., data = data.rf, importance = TRUE, proximity = TRUE,
na.action = na.omit)
# BMI.reg$importance View(BMI.reg$importance)
R version 3.4.1 (2017-06-30)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS Sierra 10.12.6
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.4/Resources/lib/libRlapack.dylib
locale:
[1] C
attached base packages:
[1] stats4 parallel stats graphics grDevices utils datasets
[8] methods base
other attached packages:
[1] randomForest_4.6-14 DESeq2_1.16.1
[3] SummarizedExperiment_1.6.5 DelayedArray_0.2.7
[5] matrixStats_0.54.0 Biobase_2.36.2
[7] GenomicRanges_1.28.6 GenomeInfoDb_1.12.3
[9] IRanges_2.10.5 S4Vectors_0.14.7
[11] BiocGenerics_0.22.1 vegan_2.5-4
[13] lattice_0.20-38 permute_0.9-5
[15] pheatmap_1.0.12 microbiomeSeq_0.1
[17] factoextra_1.0.5 cluster_2.0.7-1
[19] plotly_4.9.0 ggplot2_3.2.0
[21] phyloseq_1.25.2 kableExtra_1.1.0
[23] rmdformats_0.3.5 knitr_1.23
loaded via a namespace (and not attached):
[1] backports_1.1.4 uuid_0.1-2 Hmisc_4.2-0
[4] plyr_1.8.4 igraph_1.2.4 lazyeval_0.2.2
[7] sp_1.3-1 splines_3.4.1 BiocParallel_1.10.1
[10] rncl_0.8.3 digest_0.6.19 foreach_1.4.4
[13] htmltools_0.3.6 gdata_2.18.0 memoise_1.1.0
[16] checkmate_1.9.3 magrittr_1.5 annotate_1.54.0
[19] Biostrings_2.44.2 readr_1.3.1 gmodels_2.18.1
[22] prettyunits_1.0.2 colorspace_1.4-1 blob_1.1.1
[25] rvest_0.3.4 ggrepel_0.8.1 xfun_0.8
[28] dplyr_0.8.0.1 crayon_1.3.4 RCurl_1.95-4.12
[31] jsonlite_1.6 genefilter_1.58.1 phylobase_0.8.6
[34] survival_2.44-1.1 iterators_1.0.10 ape_5.3
[37] glue_1.3.1 gtable_0.3.0 zlibbioc_1.22.0
[40] XVector_0.16.0 webshot_0.5.1 seqinr_3.4-5
[43] questionr_0.7.0 adegraphics_1.0-15 scales_1.0.0
[46] DBI_1.0.0 miniUI_0.1.1.1 Rcpp_1.0.1
[49] viridisLite_0.3.0 xtable_1.8-4 progress_1.2.2
[52] spData_0.3.0 htmlTable_1.13.1 bit_1.1-14
[55] foreign_0.8-71 spdep_0.7-9 Formula_1.2-3
[58] htmlwidgets_1.3 httr_1.4.0 RColorBrewer_1.1-2
[61] acepack_1.4.1 pkgconfig_2.0.2 XML_3.98-1.20
[64] nnet_7.3-12 deldir_0.1-16 locfit_1.5-9.1
[67] labeling_0.3 AnnotationDbi_1.38.2 tidyselect_0.2.5
[70] rlang_0.3.1 reshape2_1.4.3 later_0.8.0
[73] munsell_0.5.0 adephylo_1.1-11 tools_3.4.1
[ reached getOption("max.print") -- omitted 52 entries ]